# Dr. M. Baron, Statistical Machine Learning class, STAT-427/627
# APPLICATIONS

1. Matrix computation for Google's PageRank algorithm

How to solve the matrix equation and obtain the Google solution for the
textbook example (Elements of Statistical Learning, pp. 576-578)
> x=c(0,1,1,0,0,0,1,0,1,0,0,0,0,0,1,0)
> L = matrix(x,4,4)   # Define a 4x4 matrix L showing linkage among web pages
> L                   # R reads a matrix column by column
     [,1] [,2] [,3] [,4]
[1,]    0    0    1    0
[2,]    1    0    0    0
[3,]    1    1    0    1
[4,]    0    0    0    0
> e = matrix( rep(1,4), 4, 1 ) # Vector of 1s
> e
[,1]
[1,] 1
[2,] 1
[3,] 1
[4,] 1
> C = diag( colSums(L) ) # Diagonal matrix C showing the number
> C # of outgoing links from each web page
     [,1] [,2] [,3] [,4]
[1,]    2    0    0    0   # Matrix multiplication is %*%
[2,]    0    1    0    0   # Inverse matrix is solve(C)
[3,]    0    0    1    0   # Transpose matrix is t(e)
[4,]    0    0    0    1
> A = 0.15*(e%*%t(e))/4 + 0.85*L%*%solve(C)
> A
       [,1]   [,2]   [,3]   [,4]
[1,] 0.0375 0.0375 0.8875 0.0375
[2,] 0.4625 0.0375 0.0375 0.0375
[3,] 0.4625 0.8875 0.0375 0.8875
[4,] 0.0375 0.0375 0.0375 0.0375
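# A quick sanity check (an added sketch, not in the original transcript):
# A is column-stochastic by construction, since each column of 0.15*(e%*%t(e))/4
# sums to 0.15 and each column of 0.85*L%*%solve(C) sums to 0.85.
> colSums(A)   # every entry should equal 1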
> eigen(A)   # Computes eigenvalues and eigenvectors of A
$values
[1]  1.000000e+00+0.00e+00i -4.250000e-01+4.25e-01i
[3] -4.250000e-01-4.25e-01i  5.477659e-18+0.00e+00i

$vectors
               [,1]                          [,2]
[1,] -0.64470397+0i  7.071068e-01+0.000000e+00i
[2,] -0.33889759+0i -3.535534e-01-3.535534e-01i
[3,] -0.68212419+0i -3.535534e-01+3.535534e-01i
[4,] -0.06489841+0i -2.175463e-17-4.017163e-18i
                           [,3]              [,4]
[1,]  7.071068e-01+0.000000e+00i -9.941546e-16+0i
[2,] -3.535534e-01+3.535534e-01i -7.071068e-01+0i
[3,] -3.535534e-01-3.535534e-01i -1.968623e-16+0i
[4,] -2.175463e-17+4.017163e-18i  7.071068e-01+0i
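# The PageRank vector can be read directly off this output: the first
# eigenvalue equals 1, and its eigenvector, rescaled to sum to one, is the
# stationary distribution. A minimal sketch (an addition, not in the
# original transcript):
> v = Re( eigen(A)$vectors[,1] )   # eigenvector for the eigenvalue 1; real up to rounding
> v / sum(v)                       # rescaled to sum to 1; should match the columns of A100 below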
> install.packages("expm”) # This package contains matrix powers
> library(expm) #
“Exponentiation of a matrix”
> A100 = A%^%100 # Large power of matrix A
> A100 # (a
simple but not an optimal method)
          [,1]      [,2]      [,3]      [,4]
[1,] 0.3725269 0.3725269 0.3725269 0.3725269
[2,] 0.1958239 0.1958239 0.1958239 0.1958239
[3,] 0.3941492 0.3941492 0.3941492 0.3941492
[4,] 0.0375000 0.0375000 0.0375000 0.0375000
# "Converged"! All columns are the same
> p = A100[,1]   # Take one column of A100
> p
[1] 0.3725269 0.1958239 0.3941492 0.0375000
> p = t(t(p))    # Now R understands it as a column
> p
[,1]
[1,] 0.3725269
[2,] 0.1958239
[3,] 0.3941492
[4,] 0.0375000
# Verify that it is a solution of p = Ap
> A%*%p
[,1]
[1,] 0.3725269
[2,] 0.1958239
[3,] 0.3941492
[4,] 0.0375000
# p is not properly normalized yet; we need e'p = N = 4,
> t(e)%*%p # but we have the sum of p[i] = 1
[,1]
[1,] 1
> p*4            # This is the solution. Ranked by importance, we have
          [,1]   # web pages ordered as 3, 1, 2, 4
[1,] 1.4901074
[2,] 0.7832956
[3,] 1.5765969
[4,] 0.1500000
# Another way is by iterations (the standard method of solving fixed-point equations):
> p = matrix(c(1,0,0,0),4,1)   # The initial vector is more or less arbitrary
> p
[,1]
[1,] 1
[2,] 0
[3,] 0
[4,] 0
> for (n in (1:100)){ p = A%*%p }   # Iterate p[next] = A*p[current]
> 4*p # Same result
[,1]
[1,] 1.4901074
[2,] 0.7832956
[3,] 1.5765969
[4,] 0.1500000
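# In practice, one iterates until convergence rather than a fixed 100 times.
# A minimal sketch of this variant (an addition; the tolerance 1e-12 is an
# arbitrary choice):
> p = matrix(c(1,0,0,0),4,1)
> repeat {
+   p.new = A %*% p
+   if ( max(abs(p.new - p)) < 1e-12 ) break   # successive iterates agree
+   p = p.new
+ }
> 4*p.new                                      # should reproduce the result above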
2. Spam detection
# The Spam data, with a description, are at https://archive.ics.uci.edu/ml/datasets/Spambase
# We can also get a formatted version from the package "kernlab"
> library(kernlab)
> data(spam)
> dim(spam)
[1] 4601   58
> names(spam)
[1] "make" "address" "all" "num3d"
[5] "our" "over" "remove" "internet"
[9] "order" "mail" "receive" "will"
[13]
"people"
"report"
"addresses"
"free"
[17]
"business"
"email"
"you"
"credit"
[21]
"your"
"font"
"num000"
"money"
[25]
"hp" "hpl"
"george" "num650"
[29]
"lab"
"labs"
"telnet"
"num857"
[33]
"data"
"num415"
"num85"
"technology"
[37]
"num1999"
"parts"
"pm"
"direct"
[41]
"cs"
"meeting"
"original"
"project"
[45]
"re" "edu"
"table"
"conference"
[49]
"charSemicolon" "charRoundbracket" "charSquarebracket"
"charExclamation"
[53]
"charDollar" "charHash" "capitalAve" "capitalLong"
[57]
"capitalTotal" "type"
> attach(spam)   # make the columns, such as "type", accessible by name
> table(type)
type
nonspam    spam
   2788    1813
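# For reference (an added line, not in the original transcript), the class
# proportions:
> round( prop.table( table(type) ), 2 )   # roughly 61% nonspam, 39% spam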
# LOGISTIC REGRESSION
> logreg = glm( type ~ ., data=spam, family = "binomial" )
> prob.spam = fitted.values(logreg)
> class.spam = ifelse( prob.spam > 0.5, "spam", "nonspam" )
> table(class.spam, type)
          type
class.spam nonspam spam
   nonspam    2666  194
   spam        122 1619
> mean( type == class.spam )
[1] 0.9313193   # Correct classification rate = 93% (should be validated
                # by CV; a sketch follows)
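# A minimal sketch of that cross-validation (an addition, not in the original
# transcript; K=10 folds and the 0.5 cutoff inside the cost function are
# assumptions):
> library(boot)
> cost01 = function(y, prob) mean( abs(y - prob) > 0.5 )   # 0-1 misclassification loss
> cv.glm( spam, logreg, cost=cost01, K=10 )$delta[1]       # CV estimate of the error rate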
# What are the error rates among spam and benign emails?
> sum( type=="nonspam"
& class.spam=="spam" ) / sum(
type=="nonspam" )
[1] 0.04375897
> sum( type=="spam" & class.spam=="nonspam" )
/ sum( type=="spam" )
[1] 0.107005
# Reducing the error rate for nonspam emails, with loss function L(1,0)=1,
# L(0,1)=10. Misclassifying a nonspam email as spam is now 10 times as costly,
# so the optimal rule classifies an email as spam only when P(spam) > 10/(10+1) = 10/11.
> class.spam = ifelse( prob.spam > 10/11, "spam", "nonspam" )
> table(class.spam, type)
          type
class.spam nonspam spam
   nonspam    2759  674
   spam         29 1139
> mean( type == class.spam )
[1] 0.8472071
> sum( type=="nonspam"
& class.spam=="spam" ) / sum(
type=="nonspam" )
[1] 0.01040172
> sum( type=="spam" & class.spam=="nonspam" )
/ sum( type=="spam" )
[1] 0.3717595
# The overall error rate increased, and the error rate of misclassifying spam
# emails as nonspam increased substantially, but the error rate for benign
# emails is only 1% now.
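# A sketch of why t = 10/11 is the right cutoff (an addition; the grid of
# thresholds is an arbitrary choice): the total loss,
# 1*(# spam passed) + 10*(# nonspam blocked), should be near its minimum at t = 10/11.
> for (t in c(0.5, 0.75, 10/11, 0.95)) {
+   cl = ifelse( prob.spam > t, "spam", "nonspam" )
+   loss = sum( type=="spam" & cl=="nonspam" ) + 10*sum( type=="nonspam" & cl=="spam" )
+   cat( "t =", round(t,3), " total loss =", loss, "\n" )
+ }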
# DECISION TREES
> library(tree)   # the tree() function is in the package "tree"
> tr = tree( type ~ ., data=spam )
> plot(tr); text(tr);
> summary(tr)
Misclassification error rate: 0.08259 = 380 / 4601
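# Before moving to random forests, the tree could also be pruned by
# cross-validation; a minimal sketch (an addition, using the standard
# cost-complexity tools of the "tree" package):
> cv.tr = cv.tree( tr, FUN=prune.misclass )   # CV error for each subtree size
> best = cv.tr$size[ which.min(cv.tr$dev) ]   # size with the smallest CV error
> prune.misclass( tr, best=best )             # the pruned tree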
# Optimization – random forests
> library(randomForest)
> rf = randomForest( type ~ ., data=spam )
> varImpPlot(rf)   # To see which variables are most significant in a tree-based classification
> rf
Number of trees: 500
No. of variables tried at each split: 7   # That's approx. the square root of the number of X-variables
OOB estimate of error rate: 4.52%         # The lowest error rate we could get so far
Confusion matrix:
        nonspam spam class.error
nonspam    2710   78  0.02797704
spam        130 1683  0.07170436
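# The default mtry is floor(sqrt(57)) = 7; a hedged sketch of comparing a few
# other values by their OOB error (an addition; the grid and ntree=300 are
# arbitrary choices):
> for (m in c(4, 7, 12)) {
+   rf.m = randomForest( type ~ ., data=spam, mtry=m, ntree=300 )
+   cat( "mtry =", m, " OOB error =", mean( rf.m$predicted != spam$type ), "\n" )
+ }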
# DISCRIMINANT ANALYSIS
> library(MASS)
> LDA = lda( type ~ ., data=spam, CV=TRUE )
> table( type, LDA$class )
type      nonspam spam
  nonspam    2656  132
  spam        394 1419
> mean( type != LDA$class )
[1] 0.114323   # This error rate is higher than the logistic regression's
> QDA = qda( type ~ ., data=spam, CV=TRUE )
> table( type, QDA$class )
type      nonspam spam
  nonspam    2090  691
  spam         87 1722
> mean( type != QDA$class )
[1] NA
> summary(QDA$class)
nonspam    spam    NA's
   2177    2413      11   # There are missing values; predictions were not computed by QDA
# We'll remove these points
> mean( type != QDA$class, na.rm=TRUE )
[1] 0.1694989   # No improvement with QDA; linear models are OK
# SUPPORT VECTOR MACHINES
> library(e1071)
> SVM = svm( type ~ ., data=spam, kernel="linear" )
> summary(SVM)
Number of Support Vectors: 940
( 456 484 )   # 940 support vectors (not cleanly classified) out of 4601 data points
> dim(spam)
[1] 4601 58
> class = predict(SVM)
> table(class,type)
         type
class     nonspam spam
  nonspam    2663  185
  spam        125 1628
> mean( class != type )
[1] 0.06737666   # The best rate so far? Note that this is a training (resubstitution)
                 # rate, while the random forest's 4.52% was an out-of-bag estimate;
                 # a cross-validation sketch follows.
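# A hedged sketch of validating and tuning the cost parameter by
# cross-validation (an addition; the grid is an arbitrary choice, and this
# refits the SVM many times, so it is slow on all 4601 emails):
> tuned = tune.svm( type ~ ., data=spam, kernel="linear", cost=c(0.1, 1, 10) )
> summary(tuned)        # CV error for each value of cost
> tuned$best.model      # the SVM with the smallest CV error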